In [ ]:
# NumPy
import numpy as np
# Pandas
import pandas as pd
# Matplotlib
import matplotlib.pyplot as plt
# BeautifulSoup
from bs4 import BeautifulSoup
# Request
import requests
# RegEX
import re
In [ ]:
!jupyter nbconvert --to html customer-segmentation-using-clustering.ipynb
In [ ]:
#Getting the url 
url = "https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films"
req = requests.get(url)
In [ ]:
#Checking the status
req
Out[ ]:
<Response [200]>
In [ ]:
#Getting the content
soup = BeautifulSoup(req.content)

DATA CLEANING

In [ ]:
Film = []
Year =[]
Award = []
Nomination = []
count = 0
for i in soup.findAll('td'):
  i = re.sub('^<td>.*">|<td>|</td>|<.*>|\n',"",str(i))
  if count == 0:
    Film.append(i)
    count += 1
  elif count == 1:
    Year.append(i)
    count += 1
  elif count == 2:
    Award.append(i)
    count += 1
  else:
    count = 0
    Nomination.append(i)
In [ ]:
df = pd.DataFrame({"Film":Film[:1332], "Years":Year[:1332], "Awards":Award[:1332], "Nominations":Nomination[:1332]})
df
Out[ ]:
Film Years Awards Nominations
0 Nomadland 21 3 6
1 The Father 21 2 6
2 Judas and the Black Messiah 21 2 6
3 Minari 21 1 6
4 Mank 21 2 10
... ... ... ... ...
1327 The Yankee Doodle Mouse 1943 1 1
1328 The Yearling 1946 2 7
1329 Yesterday, Today and Tomorrow 1964 1 1
1330 You Can't Take It with You 1938 2 7
1331 Zorba the Greek 1964 3 7

1332 rows × 4 columns

In [ ]:
df.head(10)
Out[ ]:
Film Years Awards Nominations
0 Nomadland 21 3 6
1 The Father 21 2 6
2 Judas and the Black Messiah 21 2 6
3 Minari 21 1 6
4 Mank 21 2 10
5 Sound of Metal 21 2 6
6 Ma Rainey's Black Bottom 21 2 5
7 Promising Young Woman 21 1 5
8 Tenet 21 1 2
9 Soul 21 2 3
In [ ]:
df.tail(10)
Out[ ]:
Film Years Awards Nominations
1322 World Without Sun 1964 1 1
1323 Wrestling Swordfish 32 1 1
1324 Written on the Wind 1956 1 3
1325 Wuthering Heights 1939 1 8
1326 Yankee Doodle Dandy 1942 3 8
1327 The Yankee Doodle Mouse 1943 1 1
1328 The Yearling 1946 2 7
1329 Yesterday, Today and Tomorrow 1964 1 1
1330 You Can't Take It with You 1938 2 7
1331 Zorba the Greek 1964 3 7